Read prepared data.
rr subscriptions <- read_rds(‘../data/subscriptions.rds’)
summary(subscriptions)
customerid subscriptionid periodend revenuecurr revenuecurrinclvat billingcurrency startmonth endmonth isthreetoonesubs
Min. : 10006 Min. : 154 Min. :2004-03-15 Min. : 0.8 Min. : 1 Length:2301011 Min. :2003-12-01 Min. :2004-03-01 Min. :0.00000
1st Qu.: 1109200 1st Qu.: 5195644 1st Qu.:2013-04-10 1st Qu.: 15.0 1st Qu.: 15 Class :character 1st Qu.:2013-01-01 1st Qu.:2013-04-01 1st Qu.:0.00000
Median : 5404300 Median :15011583 Median :2015-04-03 Median : 63.2 Median : 79 Mode :character Median :2014-12-01 Median :2015-04-01 Median :0.00000
Mean : 8160520 Mean :14758473 Mean :2014-09-14 Mean : 1371.7 Mean : 1393 Mean :2014-05-16 Mean :2014-08-31 Mean :0.03175
3rd Qu.:14665703 3rd Qu.:24306942 3rd Qu.:2016-11-19 3rd Qu.: 174.4 3rd Qu.: 218 3rd Qu.:2016-08-01 3rd Qu.:2016-11-01 3rd Qu.:0.00000
Max. :24113207 Max. :29648411 Max. :2020-03-27 Max. :2823000.0 Max. :2823000 Max. :2018-03-01 Max. :2020-03-01 Max. :1.00000
months status num_previous_months num_previous_subs num_previous_months_binned firstpaiddate channelcat paymentperiodchosenatstart
Min. : 1.000 Length:2301011 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. :2003-12-15 Length:2301011 Min. :-1.000
1st Qu.: 1.000 Class :character 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.:2011-01-01 Class :character 1st Qu.: 3.000
Median : 3.000 Mode :character Median : 8.00 Median : 3.00 Median : 8.00 Median :2013-07-01 Mode :character Median : 3.000
Mean : 3.527 Mean : 15.33 Mean : 6.02 Mean :14.69 Mean :2013-01-30 Mean : 4.242
3rd Qu.: 3.000 3rd Qu.: 22.00 3rd Qu.: 8.00 3rd Qu.:26.00 3rd Qu.:2015-09-10 3rd Qu.: 3.000
Max. :24.000 Max. :162.00 Max. :69.00 Max. :39.00 Max. :2018-03-28 Max. :24.000
currency marketname siteverkey firstpaidmonth firstdevice segment isquickpurchase productversion isfreemium
Length:2301011 Length:2301011 Length:2301011 Min. :2003-12-01 Length:2301011 Length:2301011 Min. :0.0000 Length:2301011 Min. :0.0000
Class :character Class :character Class :character 1st Qu.:2011-01-01 Class :character Class :character 1st Qu.:0.0000 Class :character 1st Qu.:0.0000
Mode :character Mode :character Mode :character Median :2013-07-01 Mode :character Mode :character Median :1.0000 Mode :character Median :0.0000
Mean :2013-01-16 Mean :0.5291 Mean :0.2602
3rd Qu.:2015-09-01 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :2018-03-01 Max. :1.0000 Max. :1.0000
NA's :274 NA's :274
model31224 threetoonestartdate market_category siteverkey_cat siteverkey_cat2 chosen_subs_length isthreetoonestate gdppercapita gdppercapita_scaled
Length:2301011 Min. :2017-02-14 Length:2301011 Length:2301011 Length:2301011 Length:2301011 Min. :0.00000 Min. : 218.3 Min. :-2.4363
Class :character 1st Qu.:2017-03-14 Class :character Class :character Class :character Class :character 1st Qu.:0.00000 1st Qu.: 42013.3 1st Qu.:-0.4640
Mode :character Median :2017-04-18 Mode :character Mode :character Mode :character Mode :character Median :0.00000 Median : 55670.9 Median : 0.1805
Mean :2017-05-25 Mean :0.04074 Mean : 51846.7 Mean : 0.0000
3rd Qu.:2017-09-06 3rd Qu.:0.00000 3rd Qu.: 60637.3 3rd Qu.: 0.4148
Max. :2017-12-19 Max. :1.00000 Max. :108422.5 Max. : 2.6698
NA's :2144362
subscription_summary subscription_summary_no_market
Length:2301011 Length:2301011
Class :character Class :character
Mode :character Mode :character
subscriptions_with_target <- subscriptions %>%
# restrict to a recent expiry window
filter(endmonth >= begin_train_window & endmonth < end_window) %>%
mutate(num_previous_months_binned_fct = as.factor(num_previous_months_binned)) %>%
mutate(set_type = as.factor(if_else(endmonth >= begin_validation_window, 'validation', 'training'))) %>%
mutate(churnind = ifelse(status == 'churn', 1, 0))
Prepare churntable that we want to predict.
churntable <- subscriptions_with_target %>%
group_by(set_type, siteverkey_cat2, market_category, months, num_previous_months_binned, chosen_subs_length, subscription_summary_no_market) %>%
summarise(num_obs = n(),
churned = sum(churnind)) %>%
group_by(set_type) %>%
mutate(churn_rate = churned / num_obs,
renew_rate = 1 - churn_rate,
month_churn = 1 - renew_rate ^ (1/as.double(months)),
log_month_churn = log(month_churn),
weight = num_obs / sum(num_obs))
# NB! Does this introduce a bad bias ????
churntable_no_zeros <- churntable %>%
filter(churn_rate > 0)
Train model
new_model=glm(log_month_churn ~ market_category + subscription_summary_no_market, data=churntable_no_zeros[churntable_no_zeros$set_type == 'training', ], weights = weight)
write_rds(new_model, '../data/models/churn_model.rds')
Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:
validation_plots(prediction_table, minimal_share = 0.01)
NAs introduced by coercion
Try simple logistic model
model_logit <- glm(churnind ~ market_category + siteverkey_cat2 + num_previous_months_binned + months + chosen_subs_length,
data = subscriptions_with_target[subscriptions_with_target$set_type == 'training',], family = 'binomial')
Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:
prediction_table_logit <- validation(subscriptions_with_target, model_logit)
validation_plots(prediction_table_logit, minimal_share = 0.01)
NAs introduced by coercion